In [1]:
# Load one SD filing (pk=711 — the 3M example explored below) and its
# related filing documents.
filing = EdgarSDFiling.objects.get(pk=711)
docs = filing.edgarsdfilingdocument_set.all()

In [2]:
# First document of the filing: the complete raw submission text file
# (its description is displayed below).
doc_a = docs[0]
content_a = doc_a.edgardocumentcontent_set.get()
doc_a.description


Out[2]:
'Complete submission text file'

In [3]:
# Fifth document of the filing: the SD form itself (HTML, per the content
# preview below).
doc_b = docs[4]
content_b = doc_b.edgardocumentcontent_set.get()
doc_b.description


Out[3]:
'SD'

In [4]:
content_a.content[0:200]


Out[4]:
'<SEC-DOCUMENT>0001104659-17-036452.txt : 20170531\n<SEC-HEADER>0001104659-17-036452.hdr.sgml : 20170531\n<ACCEPTANCE-DATETIME>20170531113151\nACCESSION NUMBER:\t\t0001104659-17-036452\nCONFORMED SUBMISSION '

In [5]:
content_b.content[0:200]


Out[5]:
'<DOCUMENT>\n<TYPE>SD\n<SEQUENCE>1\n<FILENAME>a17-14316_1sd.htm\n<DESCRIPTION>SD\n<TEXT>\n\n\n<html>\n<head>\n\n\n\n\n  </head>\n<body link=blue lang="EN-US">\n<div style="font-family:Times New Roman;">\n<div style="bo'

In [98]:
# NOTE(review): execution count In[98] is out of order — confirm this cell
# still runs before the extraction cells on a fresh Restart & Run All,
# since they depend on `extractor` defined here.
import toolz
from urlextract import URLExtract
extractor = URLExtract()

In [7]:
# Walk every document in the filing, extract URLs from any stored content,
# and print the de-duplicated result (toolz.unique preserves first-seen order).
all_urls = []
for doc in docs:
    try:
        doc_content = doc.edgardocumentcontent_set.get()
    except EdgarDocumentContent.DoesNotExist:
        # No content row stored for this document — skip it.
        continue
    content = doc_content.content
    if not content:
        continue
    print(doc_content.id)
    all_urls.extend(extractor.find_urls(content))
unique_urls = toolz.unique(all_urls)
print(list(unique_urls))


1012
1013
1014
['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160', 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/', 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).']

In [8]:
# Sanity check: list the primary keys of the filing's documents.
for doc in docs:
    print(doc.id)


4029
4028
4027
4026
4025

In [9]:
extract_urls = docs.values_list('edgardocumentcontent__urls', flat=True)

In [10]:
from toolz import filter, accumulate

def compact(iterable):
    """Return a lazy iterator over `iterable` with falsy items removed.

    Fix: the parameter was named `iter`, shadowing the builtin; renamed
    to `iterable`. Behavior is unchanged (`filter(None, ...)`).
    """
    return filter(None, iterable)

In [11]:
compacted = list(compact(extract_urls))
compacted


Out[11]:
[['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160',
  'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/',
  'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).'],
 ['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/',
  'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).'],
 ['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160']]

In [12]:
from itertools import chain

# Flatten the per-document URL lists into one flat list of URLs.
# (Equivalent to list(chain.from_iterable(compacted)).)
flattened = [url for url_list in compacted for url in url_list]
flattened


Out[12]:
['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160']

In [13]:
list(toolz.unique(flattened))


Out[13]:
['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).']

In [14]:
filing.extracted_urls


Out[14]:
['http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/&#160',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/',
 'http://www.3m.com/3M/en_US/suppliers-direct/supplier-requirements/supplier-responsibility-expectations/).']

Investigate WestRock, whose filing initially yields no URLs with the default extractor, along with other filers that may need the same attention:

NUCOR CORP, Owens Corning, NOVARTIS AG, James Hardie Industries plc, GENERAL DYNAMICS CORP, ESTEE LAUDER COMPANIES INC, ALBEMARLE CORP


In [16]:
filing = EdgarSDFiling.objects.get(pk=775)

In [19]:
# The WestRock filing's documents: raw submission text, an exhibit, and the
# SD form (see the queryset repr below).
docs = filing.edgarsdfilingdocument_set.all()
docs


Out[19]:
<QuerySet [<EdgarSDFilingDocument: WestRock Co - 2016 (SD) - 0001171843-16-010390.txt>, <EdgarSDFilingDocument: WestRock Co - 2016 (SD) - exh_101.htm>, <EdgarSDFilingDocument: WestRock Co - 2016 (SD) - fsd_053116.htm>]>

In [30]:
# Same extraction loop as In[7], rerun against the WestRock documents.
# Fix: build the URLExtract instance once, before the loop — constructing
# it per document was loop-invariant wasted work (output is unchanged).
extractor = URLExtract()
all_urls = []
for doc in docs:
    try:
        # Get the doc content
        doc_content = doc.edgardocumentcontent_set.get()
    except EdgarDocumentContent.DoesNotExist:
        continue
    content = doc_content.content
    if content:
        urls = extractor.find_urls(content)
        print(urls)
        if urls:
            all_urls.extend(urls)
unique_urls = toolz.unique(all_urls)
print(list(unique_urls))


[]
[]
[]
[]

In [107]:
# Workaround run: the plain extraction above found nothing, but rewriting
# '.com.' -> '.com' makes URLExtract find 'www.westrock.com' — presumably the
# content renders the domain with a trailing period, which URLExtract rejects
# as an invalid TLD. TODO: confirm against the raw content and consider a
# more targeted fix than a blanket string replace.
# Fix: build the URLExtract instance once, before the loop (loop-invariant).
extractor = URLExtract()
all_urls = []
for doc in docs:
    try:
        # Get the doc content
        doc_content = doc.edgardocumentcontent_set.get()
    except EdgarDocumentContent.DoesNotExist:
        continue
    content = doc_content.content
    if content:
        urls = extractor.find_urls(content.replace('.com.', '.com'))
        print(urls)
        if urls:
            all_urls.extend(urls)
unique_urls = toolz.unique(all_urls)
print(list(unique_urls))


['www.westrock.com', 'www.westrock.com', 'www.westrock.com']
['www.westrock.com', 'www.westrock.com']
['www.westrock.com']
['www.westrock.com']

In [ ]: